In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [ ]:
## 1.Cleaning up the dataset by renaming columns and removing unnecessary rows.
## 2.Converting date columns to the appropriate format.
## 3.Converting relevant columns to numeric or boolean types for analysis.
In [85]:
## Load The Data Set
file_path = '/Users/cactusjack/Downloads/Delta Airlines Loyalty Customer Raw Data - Frequent Flier Raw.csv'
delta_data = pd.read_csv(file_path)
In [33]:
file_path1 = '/Users/cactusjack/Downloads/Delta Airlines Loyalty Customer Raw Data - Flier Upcoming Flights Raw.csv'
delta_upcoming_flights = pd.read_csv(file_path1)
In [34]:
delta_upcoming_flights.head()
Out[34]:
Join Date Frequent Flier Number First Name Last Name Flight Number Flight Date Departing City Arrival City Departure Time Arrival Time Departure Time Zone Arrival Time Zone Cost Car Rental Addon $ Hotel Addon $
0 2009-10-07 9531726184 Megan Brown DL7263 2024-11-04 Berlin Miaml 4:38 9:38 UTC CST 998.55 BRL 197.5 BRL 419.24 BRL
1 1996-07-29 7654154528 William Lee DL685k 2024-11-04 Dubai Hong Kong 22:38 8:38 UTC UTk 283.62 AUD 256.04 AUD 302
2 1990-03-11 1035783281 Courtney Valdez DL8569 2025-01-03 Houston Paris 0:38 10:38 PST UTC 335.82 GBP 117.56 GBP 263.57 GBP
3 2002-12-16 787669065 Erik Schneider DL7894 2025-07-21 New York Miami 23:38 10:38 EST PSy 686.64 USD 77.29 USD 197.77 USD
4 NaN 2871854617 Alexander Johnson DL6365 2025-01-21 Dallas New York 2:38 4:38 CST EST 601.82 USD 161.71 USD 197.69 USD
In [35]:
new_column_names = ['Record_Creation', 'Join_Date', 'First_Name', 'Last_Name', 'Frequent_Flier_Number', 'Flight_Number', 'Departure_Airport', 'Arrival_Airport', 'Flight_Date', 'Flight_Time', 'Seat_Number', 'Meal_Preference', 'Checked_Bags', 'Special_Assistance', 'Flight_Status']
delta_upcoming_flights.columns = new_column_names
In [37]:
delta_upcoming_flights = delta_upcoming_flights.dropna(subset=['Frequent_Flier_Number', 'Flight_Number'])
In [57]:
delta_upcoming_flights.dtypes
Out[57]:
Record_Creation                  object
Join_Date                         int64
First_Name                       object
Last_Name                        object
Frequent_Flier_Number            object
Flight_Number                    object
Departure_Airport                object
Arrival_Airport                  object
Flight_Date              datetime64[ns]
Flight_Time                      object
Seat_Number                      object
Meal_Preference                  object
Checked_Bags                     object
Special_Assistance               object
Flight_Status                    object
dtype: object
In [56]:
delta_upcoming_flights['Flight_Date'] = pd.to_datetime(delta_upcoming_flights['Flight_Date'], errors='coerce')
In [59]:
delta_upcoming_flights['Checked_Bags'] = pd.to_numeric(delta_upcoming_flights['Checked_Bags'], errors='coerce')
In [38]:
delta_upcoming_flights['Meal_Preference'].fillna('No Preference', inplace=True)
delta_upcoming_flights['Seat_Number'].fillna('Unassiged', inplace=True)
In [64]:
delta_upcoming_flights.head()
Out[64]:
Record_Creation Join_Date First_Name Last_Name Frequent_Flier_Number Flight_Number Departure_Airport Arrival_Airport Flight_Date Flight_Time Seat_Number Meal_Preference Checked_Bags Special_Assistance Flight_Status
0 2009-10-07 9531726184 Megan Brown DL7263 2024-11-04 Berlin Miaml 2024-11-01 04:38:00 9:38 UTC CST NaN 197.5 BRL 419.24 BRL
1 1996-07-29 7654154528 William Lee DL685k 2024-11-04 Dubai Hong Kong 2024-11-01 22:38:00 8:38 UTC UTk NaN 256.04 AUD 302
2 1990-03-11 1035783281 Courtney Valdez DL8569 2025-01-03 Houston Paris 2024-11-01 00:38:00 10:38 PST UTC NaN 117.56 GBP 263.57 GBP
3 2002-12-16 787669065 Erik Schneider DL7894 2025-07-21 New York Miami 2024-11-01 23:38:00 10:38 EST PSy NaN 77.29 USD 197.77 USD
4 NaN 2871854617 Alexander Johnson DL6365 2025-01-21 Dallas New York 2024-11-01 02:38:00 4:38 CST EST NaN 161.71 USD 197.69 USD
In [ ]:
 
In [39]:
file_path2 = '/Users/cactusjack/Downloads/Delta Airlines Loyalty Customer Raw Data - 3 Month Upcoming Flights Raw.csv'
delta_3month_upcoming_flight = pd.read_csv(file_path2)
delta_3month_upcoming_flight.head()
Out[39]:
Flight_ID Flight Date Departure City Arrival City Departure Time Arrival Time Departure Time Zone Arrival Time Zone Airline Flight Number ... Baggage Allowance Flight Duration Layovers Cancellation Policy Booking Window Meal Options In-Flight Entertainment Aircraft Type Flight Status Loyalty Points Earned
0 FL22499 2024-09-11 Dallas Berlin 22:4j 1:46 NaN UTC Delta Un5086 ... 2 bags 8 hours NaN Refundable 6 days Gluten-Free Movies Airbus A380 On Time 2274
1 FL62773 2024-11-06 Sydney Hong Kong 5:46 12:46 UTC UTC Lufthansa Qa2994 ... 2 bags 6 hours Toronto for 3 hours Refundable 9 days Non-Vegetarian Movies Boeing 787 On Time 4337
2 FL96338 2024-09-12 Los Angeleu Atlanta 6:46 17:46 PST MST Lufthansa Em9455 ... 2 bags 15 hourj NaN Non-Refundable 18 days Gluten-Free None Airbus A380 Delayed 164
3 FL85371 2024-09-20 San Francisco Tokyo 7:46 13:4l UTC UTC British Airways Br6054 ... 1 bags 15 hours NaN NaN 19 days Non-Vegetarian Movies Airbus A320 Delayed 2711
4 FL65244 2024-10-02 Miami San Francisco 10:46 20:46 PST UTC Air Canada Br3533 ... 3 bags 9 hours Hong Kong for 5 hours Non-Refundable 10 days Non-Vegetarian Movies Airbus A320 Scheduled 2451

5 rows × 24 columns

In [40]:
delta_data.head()
Out[40]:
Unnamed: 0 MAKE A COPY TO BEGIN YOUR WORK. THIS IS A READ ONLY FILE Unnamed: 2 Unnamed: 3 Unnamed: 4 Unnamed: 5 Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9 Unnamed: 10 Unnamed: 11 Unnamed: 12 Unnamed: 13 Unnamed: 14
0 Record Creation Frequent Flier Number Join Date First Name Last Name Inquiry Type Most Recent Flight # Lounge Used? Departing Airport Arrival Airport Planned Snack? Additional Snack? # of Included Checked Bags Total # of Checked Bags Flight Delayed?
1 2024-02-27 7234617746 1988/07/08 debbie spears Flight Status G35206241 FALSE DFW DEN FALSE TRUE 0 16 TRUE
2 2021-03-07 7234617746 1988/07/08 debbie Spears New Flight V81311927 TRUE JFK SFO FALSE TRUE 1 21 FALSE
3 2023-01-31 7234617746 07/08/1988 debbie spears Cancel Flight H82295055 FALSE MIA ORD FALSE FALSE 7 5 TRUE
4 2021-08-21 551121699 06/16/2022 kyle Boone New Flight E39886834 TRUE ttt NaN FALSE TRUE NaN 8 FALSE
In [61]:
## Clean and Reformat the dataset
## Rename the columns
## Dropping the row
new_columns = ["Record_Creation", "Frequent_Flier_Number", "Join_Date", "First_Name", "Last_Name", "Inquiry_Type", "Most_Recent_Flight_Number", "Lounge_Used", "Departing_Airport", "Arrival_Airport", "Planned_Snack", "Additional_Snack", "Included_Checked_Bags", "Total_Checked_Bags", "Flight_Delayed"]
delta_data_Cleaned = delta_data.drop(index=0)
delta_data_Cleaned.columns = new_columns
In [27]:
## Convert Join_Date to datetime format and handle any errors
delta_data_Cleaned['Join_Date'] = pd.to_datetime(delta_data_Cleaned['Join_Date'], errors='coerce', format='%Y-%M-%D')
In [42]:
## Clean and reset index for further analysis
delta_data_Cleaned = delta_data_Cleaned.reset_index(drop=True)
In [43]:
## Check for missing or inconsistent data
## Convert boolean-like columns to proper boolean values
delta_data_Cleaned['Lounge_Used'] = delta_data_Cleaned['Lounge_Used'].apply(lambda x: x == 'TRUE')
delta_data_Cleaned['Flight_Delayed'] = delta_data_Cleaned['Flight_Delayed'].apply(lambda x: x == 'TRUE')
In [44]:
## Convert numeric columns to numeric, handle errors 
delta_data_Cleaned['Included_Checked_Bags'] = pd.to_numeric(delta_data_Cleaned['Included_Checked_Bags'], errors='coerce')
delta_data_Cleaned['Total_Checked_Bags'] = pd.to_numeric(delta_data_Cleaned['Total_Checked_Bags'], errors='coerce')
delta_data_Cleaned.head()
Out[44]:
Record_Creation Frequent_Flier_Number Join_Date First_Name Last_Name Inquiry_Type Most_Recent_Flight_Number Lounge_Used Departing_Airport Arrival_Airport Planned_Snack Additional_Snack Included_Checked_Bags Total_Checked_Bags Flight_Delayed
0 2024-02-27 7234617746 1988/07/08 debbie spears Flight Status G35206241 False DFW DEN FALSE TRUE 0.0 16.0 True
1 2021-03-07 7234617746 1988/07/08 debbie Spears New Flight V81311927 True JFK SFO FALSE TRUE 1.0 21.0 False
2 2023-01-31 7234617746 07/08/1988 debbie spears Cancel Flight H82295055 False MIA ORD FALSE FALSE 7.0 5.0 True
3 2021-08-21 551121699 06/16/2022 kyle Boone New Flight E39886834 True ttt NaN FALSE TRUE NaN 8.0 False
4 2022-03-01 551121699 16-06-2022 KYLE Boone New Flight R04390271 False SFO DFW FALSE FALSE 3.0 17.0 False
In [45]:
## Customer Segmentation
## Group by Frequent Flyer Number to calculate flight frequency, lounge usage, and other relevant metrics
## Count the number of flights per customer 
## Sum of lounge usages (True = 1, False = 0)
## Sum of total checked bags
## Count number of delayed flights
customer_segmentation = delta_data_Cleaned.groupby('Frequent_Flier_Number').agg({'Record_Creation': 'count', 'Lounge_Used': lambda x: x.sum(), 'Total_Checked_Bags': 'sum', 'Flight_Delayed': lambda x: x.sum()}).reset_index()
In [46]:
## Lets rename columns for clarity
customer_segmentation.columns = ['Frequent_Flier_Number', 'Flight_Frequency', 'Lounge_Users', 'Total_Checked_Bags', 'Delayed_Flights']
print(customer_segmentation.head())
  Frequent_Flier_Number  Flight_Frequency  Lounge_Users  Total_Checked_Bags  \
0            1007638577                 5             3                45.0   
1            1007847878                 9             8                85.0   
2            1013150620                 4             2                35.0   
3            1015849337                 5             3                53.0   
4            1026781405                 4             4                54.0   

   Delayed_Flights  
0                2  
1                7  
2                1  
3                2  
4                3  
In [93]:
customer_segmentation.head()
Out[93]:
Frequent_Flier_Number Flight_Frequency Lounge_Users Total_Checked_Bags Delayed_Flights
0 1007638577 5 3 45.0 2
1 1007847878 9 8 85.0 7
2 1013150620 4 2 35.0 1
3 1015849337 5 3 53.0 2
4 1026781405 4 4 54.0 3
In [54]:
## Flight Delay Analysis by Departure and Arrival Airports 
## Calculate the number of delayed flights by departing airport
## Count delayed flights
## Total number of flights
delays_by_departure_airport = delta_data_Cleaned.groupby('Departing_Airport').agg({'Flight_Delayed': lambda x: x.sum(), 'Record_Creation': 'count'}).reset_index()
In [55]:
## Calculate the percentage of delays at each departing airport
## Rename columns
## Sort by highest percentage of delays
delays_by_departure_airport['Delay_Percentage'] = (delays_by_departure_airport['Flight_Delayed'] / delays_by_departure_airport['Record_Creation']) * 100
delays_by_departure_airport.columns = ['Departing_Airport', 'Delayed_Flights', 'Total_Flights', 'Delay_Percentage']
delays_by_departure_airport = delays_by_departure_airport.sort_values(by='Delay_Percentage', ascending=False)
print(delays_by_departure_airport.head())
  Departing_Airport  Delayed_Flights  Total_Flights  Delay_Percentage
6               MIA              426            785         54.267516
3               DFW              425            822         51.703163
9               SFO              418            811         51.541307
2               DEN              406            799         50.813517
7               ORD              434            858         50.582751
In [23]:
print(delta_data_Cleaned.columns)
Index(['Record_Creation', 'Frequent_Flier_Number', 'Join_Date', 'First_Name',
       'Last_Name', 'Inquiry_Type', 'Most_Recent_Flight_Number', 'Lounge_Used',
       'Departing_Airport', 'Arrival_Airport', 'Planned_Snack',
       'Additional_Snack', 'Included_Checked_Bags', 'Total_Checked_Bags',
       'Flight_Delayed'],
      dtype='object')
In [56]:
delta_data_Cleaned.columns = delta_data_Cleaned.columns.str.strip()
In [27]:
print(delta_data_Cleaned.columns.to_list())
['Record_Creation', 'Frequent_Flier_Number', 'Join_Date', 'First_Name', 'Last_Name', 'Inquiry_Type', 'Most_Recent_Flight_Number', 'Lounge_Used', 'Departing_Airport', 'Arrival_Airport', 'Planned_Snack', 'Additional_Snack', 'Included_Checked_Bags', 'Total_Checked_Bags', 'Flight_Delayed']
In [31]:
## Flight Delay Analysis by Arrival Airport
delays_by_arrival_airport = delta_data_Cleaned.groupby('Arrival_Airport').agg({'Flight_Delayed': lambda x: x.sum(), 'Record_Creation': 'count'}).reset_index()
delays_by_arrival_airport['Delay_Percentage'] = (delays_by_arrival_airport['Flight_Delayed'] / delays_by_arrival_airport['Record_Creation']) * 100
delays_by_arrival_airport.columns = ['Arrival_Airport', 'Delayed_Flights', 'Total_Flights', 'Delay_Percentage']
delays_by_arrival_airport = delays_by_arrival_airport.sort_values(by='Delay_Percentage', ascending=False)
In [32]:
print("Delays by Departure Airport:")
print(delays_by_departure_airport.head())
Delays by Departure Airport:
  Departing_Airport  Delayed_Flights  Total_Flights  Delay_Percentage
6               MIA              426            785         54.267516
3               DFW              425            822         51.703163
9               SFO              418            811         51.541307
2               DEN              406            799         50.813517
7               ORD              434            858         50.582751
In [33]:
print("\nDelays by Arrival Airport:")
print(delays_by_arrival_airport.head())
Delays by Arrival Airport:
  Arrival_Airport  Delayed_Flights  Total_Flights  Delay_Percentage
4             JFK              416            793         52.459016
8             SEA              434            841         51.605232
6             MIA              424            837         50.657109
1             BOS              407            804         50.621891
7             ORD              395            783         50.446999
In [50]:
## Route analysis by grouping both Departure and Arrival airports
## Count of delayed flights for each route
## Total number of flights for each route
## The delay percentage for each route
route_analysis = delta_data_Cleaned.groupby(['Departing_Airport', 'Arrival_Airport']).agg({'Flight_Delayed': lambda x: x.sum(), 'Record_Creation': 'count'}).reset_index()
route_analysis['Delay_Percentage'] = (route_analysis['Flight_Delayed'] / route_analysis['Record_Creation']) * 100
route_analysis.columns = ['Departing_Airport', 'Arrival_Airport', 'Delayed_Flight', 'Total_Flights', 'Delay_Percentage']
route_analysis = route_analysis.sort_values(by='Delay_Percentage', ascending=False)
In [35]:
route_analysis.head()
Out[35]:
Departing_Airport Arrival_Airport Delayed_Flight Total_Flights Delay_Percentage
33 DFW JFK 54 81 66.666667
69 MIA YYY 23 36 63.888889
60 MIA ATL 48 76 63.157895
107 ttt ORD 26 42 61.904762
97 SFO ORD 44 72 61.111111
In [38]:
route_analysis.to_csv('route_analysis.csv', index=False)
In [39]:
## Descriptive Analysis
## Summary statistics for customer segmentation
## Summary statistics for flight delays by departure airport
## Summary statistics for flight delays by arrival airport
## Summary statistics for route delays
customer_stats = customer_segmentation.describe()
departure_delay_stats = delays_by_arrival_airport.describe()
arrival_delay_stats = delays_by_arrival_airport.describe()
route_delay_stats = route_analysis.describe()
In [40]:
print("Customer Segmentation Statistics:\n", customer_stats)
print("\nDeparture Airport Delay Statistics:\n", departure_delay_stats)
print("\nArrival Airport Delay Statistics:\n", arrival_delay_stats)
print("\nRoute Delay Statistics:\n", route_delay_stats)
Customer Segmentation Statistics:
        Flight_Frequency  Lounge_Users  Total_Checked_Bags  Delayed_Flights
count       2000.000000   2000.000000              2000.0      2000.000000
mean           4.364500      2.139500                 0.0         2.173500
std            1.872537      1.427256                 0.0         1.473935
min            1.000000      0.000000                 0.0         0.000000
25%            3.000000      1.000000                 0.0         1.000000
50%            4.000000      2.000000                 0.0         2.000000
75%            5.000000      3.000000                 0.0         3.000000
max           10.000000      9.000000                 0.0         9.000000

Departure Airport Delay Statistics:
        Delayed_Flights  Total_Flights  Delay_Percentage
count        11.000000      11.000000         11.000000
mean        397.363636     795.363636         49.966270
std          57.933190     115.577050          1.348114
min         226.000000     456.000000         47.846890
25%         403.500000     798.500000         49.129693
50%         415.000000     836.000000         49.561404
75%         420.000000     841.500000         50.639500
max         434.000000     872.000000         52.459016

Arrival Airport Delay Statistics:
        Delayed_Flights  Total_Flights  Delay_Percentage
count        11.000000      11.000000         11.000000
mean        397.363636     795.363636         49.966270
std          57.933190     115.577050          1.348114
min         226.000000     456.000000         47.846890
25%         403.500000     798.500000         49.129693
50%         415.000000     836.000000         49.561404
75%         420.000000     841.500000         50.639500
max         434.000000     872.000000         52.459016

Route Delay Statistics:
        Delayed_Flight  Total_Flights  Delay_Percentage
count      111.000000     111.000000        111.000000
mean        37.342342      74.891892         49.823391
std         10.295185      18.108939          6.482699
min         13.000000      29.000000         36.363636
25%         33.000000      72.000000         45.235394
50%         38.000000      80.000000         49.411765
75%         44.000000      86.000000         55.223214
max         58.000000     105.000000         66.666667
In [42]:
## Correlation Analysis between lounge usage and flight delays
correlation_lounge_delay = customer_segmentation['Lounge_Users'].corr(customer_segmentation['Delayed_Flights'])
print("\nCorrelation between Lounge Usage and Delayed Flights: ", correlation_lounge_delay)
Correlation between Lounge Usage and Delayed Flights:  0.4243717768707256
In [58]:
## Predictive Analysis for Delta Airlines using historical data to forecast trends such as passenger demand, flight delays, and potential disruptions like strike
## Using time series forecasting for passenger demand
## Grouping the cleaned dataset by 'Record_Creation' to forecast passenger demand 
## Split data into training and testing sets
## Apply Holt-Winters Exponential Smoothing for forecasting
## Make predictions for the test set
from statsmodels.tsa.holtwinters import ExponentialSmoothing
delta_data_Cleaned['Record_Creation'] = pd.to_datetime(delta_data_Cleaned['Record_Creation'])
passenger_demand = delta_data_Cleaned.groupby('Record_Creation').size().resample('M').sum()
In [47]:
train_size = int(len(passenger_demand) * 0.8)
train_data, test_data = passenger_demand[:train_size], passenger_demand[train_size:]
model = ExponentialSmoothing(train_data, seasonal='add', seasonal_periods=12).fit()
predictions = model.forecast(len(test_data))
/Users/cactusjack/opt/anaconda3/lib/python3.8/site-packages/statsmodels/tsa/holtwinters/model.py:427: FutureWarning: After 0.13 initialization must be handled at model creation
  warnings.warn(
In [48]:
plt.figure(figsize=(10, 6))
plt.plot(train_data.index, train_data, label='Train Data')
plt.plot(test_data.index, test_data, label='Test Data')
plt.plot(predictions.index, predictions, label='Predicted Passenger Demand', color='red')
plt.title('Passenger Demand Forecast')
plt.xlabel('Date')
plt.ylabel('Passenger Count')
plt.legend()
plt.show()
In [49]:
covariance = delta_data_Cleaned[['Flight_Delayed', 'Lounge_Used',]].cov()
print("Covariance between Flight Delayed and Customer Lounge Usage:\n", covariance)
Covariance between Flight Delayed and Customer Lounge Usage:
                 Flight_Delayed  Lounge_Used
Flight_Delayed        0.249465    -0.001003
Lounge_Used          -0.001003     0.249038
In [71]:
print(delta_data_Cleaned[['Flight_Delayed','Lounge_Used']].isna().sum())
Flight_Delayed    530
Lounge_Used       524
dtype: int64
In [72]:
Cleaned_data = delta_data_Cleaned[['Flight_Delayed','Lounge_Used']].dropna()
In [73]:
Cleaned_data['Flight_Delayed'] = pd.to_numeric(Cleaned_data['Flight_Delayed'], errors='coerce')
Cleaned_data['Lounge_Used'] = pd.to_numeric(Cleaned_data['Lounge_Used'], errors='coerce')
In [76]:
cov_matrix = cov_matrix.fillna(0)
In [78]:
print(delta_data_Cleaned.columns)
Index(['Record_Creation', 'Frequent_Flier_Number', 'Join_Date', 'First_Name',
       'Last_Name', 'Inquiry_Type', 'Most_Recent_Flight_Number', 'Lounge_Used',
       'Departing_Airport', 'Arrival_Airport', 'Planned_Snack',
       'Additional_Snack', 'Included_Checked_Bags', 'Total_Checked_Bags',
       'Flight_Delayed'],
      dtype='object')
In [79]:
delta_data_Cleaned['Flight_Delayed'] = pd.to_numeric(delta_data_Cleaned['Flight_Delayed'], errors='coerce')
delta_data_Cleaned['Lounge_Used'] = pd.to_numeric(delta_data_Cleaned['Lounge_Used'], errors='coerce')
In [80]:
delta_data_Cleaned = delta_data_Cleaned.dropna(subset=['Flight_Delayed', 'Lounge_Used'])
In [81]:
print(delta_data_Cleaned[['Flight_Delayed', 'Lounge_Used']])
Empty DataFrame
Columns: [Flight_Delayed, Lounge_Used]
Index: []
In [51]:
import plotly.express as px
import plotly.graph_objects as go
In [65]:
## Prepare data
passenger_trends = delta_data_Cleaned.groupby('Record_Creation').size().reset_index(name='Passenger_Count')

## Plot interactive line chart
fig = px.line(
    passenger_trends,
    x='Record_Creation',
    y='Passenger_Count',
    title='Passenger Demand Over Time',
    labels={'Passenger_Count': 'Number of Passengers'},
    template='plotly_dark')
fig.show()
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: